/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */ /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */ package net.nutch.indexer; import net.nutch.pagedb.*; import net.nutch.linkdb.*; import net.nutch.fetcher.*; import net.nutch.analysis.NutchDocumentAnalyzer; import net.nutch.db.*; import net.nutch.io.*; import net.nutch.util.*; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import java.util.logging.Logger; import java.util.logging.Level; import java.util.Date; import java.io.File; import java.io.EOFException; /** Creates an index for the output corresponding to a single fetcher run. */ public class IndexSegment { public static final String DONE_NAME = "index.done"; public static final Logger LOG = LogFormatter.getLogger("net.nutch.index.IndexSegment"); private boolean boostByLinkCount = NutchConf.getBoolean("indexer.boost.by.link.count", false); private float scorePower = NutchConf.getFloat("indexer.score.power", 0.5f); private int maxTitleLength = NutchConf.getInt("indexer.max.title.length", 100); private File directory = null; private int maxDocs = Integer.MAX_VALUE; /** Determines the power of link analyis scores. Each pages's boost is * set to <i>score<sup>scorePower</sup></i> where <i>score</i> is its link * analysis score and <i>scorePower</i> is the value passed to this method. */ public void setScorePower(float power) { scorePower = power; } private void indexPages() throws Exception { IndexWriter writer = new IndexWriter(new File(directory, "index"), new NutchDocumentAnalyzer(), true); writer.mergeFactor = 50; writer.minMergeDocs = 50; writer.infoStream = LogFormatter.getLogStream(LOG, Level.INFO); writer.setUseCompoundFile(false); writer.setSimilarity(new NutchSimilarity()); ArrayFile.Reader fetcher = null; ArrayFile.Reader text = null; int count = 0; try { fetcher = new ArrayFile.Reader(new File(directory, FetcherOutput.DIR_NAME).toString()); text = new ArrayFile.Reader(new File(directory,FetcherText.DIR_NAME).toString()); String segmentName = directory.getCanonicalFile().getName(); FetcherOutput fetcherOutput = new FetcherOutput(); FetcherText fetcherText = new FetcherText(); while (fetcher.next(fetcherOutput) != null && count++ < maxDocs) { text.next(fetcherText); if (fetcherOutput.getStatus() != FetcherOutput.SUCCESS) continue; // don't index the page Document doc = makeDocument(segmentName, fetcher.key(), fetcherOutput, fetcherText); writer.addDocument(doc); } } catch (EOFException e) { LOG.warning("Unexpected EOF in: " + directory + " at entry #" + count + ". Ignoring."); } finally { if (fetcher != null) fetcher.close(); if (text != null) text.close(); } LOG.info("Optimizing index..."); writer.optimize(); writer.close(); } private Document makeDocument(String segmentName, long docNo, FetcherOutput fetcherOutput, FetcherText fetcherText) throws Exception { FetchListEntry fle = fetcherOutput.getFetchListEntry(); String url = fle.getPage().getURL().toString(); String title = fetcherOutput.getTitle(); if (title.length() > maxTitleLength) { // truncate title if needed title = title.substring(0, maxTitleLength); } Document doc = new Document(); // url is both stored and indexed, so it's both searchable and returned doc.add(Field.Text("url", url)); // un-indexed fields: not searchable, but in hits and/or used by dedup doc.add(Field.UnIndexed("title", title)); doc.add(Field.UnIndexed("digest", fetcherOutput.getMD5Hash().toString())); doc.add(Field.UnIndexed("docNo", Long.toString(docNo, 16))); doc.add(Field.UnIndexed("segment", segmentName)); // content is indexed, so that it's searchable, but not stored in index doc.add(Field.UnStored("content", fetcherText.getText())); // anchors are indexed, so they're searchable, but not stored in index String[] anchors = fle.getAnchors(); for (int i = 0; i < anchors.length; i++) { doc.add(Field.UnStored("anchor", anchors[i])); } // add title as anchor so it's searchable. doesn't warrant its own field. doc.add(Field.UnStored("anchor", title)); // compute boost // 1. Start with page's score from DB -- 1.0 if no link analysis. float boost = fle.getPage().getScore(); // 2. Apply scorePower to this. boost = (float)Math.pow(boost, scorePower); // 3. Optionally boost by log of incoming anchor count. if (boostByLinkCount) boost *= (float)Math.log(Math.E + anchors.length); // 4. Apply boost to all indexed fields. doc.setBoost(boost); // store boost for use by explain and dedup doc.add(Field.UnIndexed("boost", Float.toString(boost))); return doc; } /** Create an index for the input files in the named directory. */ public static void main(String[] args) throws Exception { String usage = "IndexSegment <segment_directory>"; if (args.length == 0) { System.err.println("Usage: " + usage); return; } IndexSegment indexer = new IndexSegment(); for (int i = 0; i < args.length; i++) { if (args[i].equals("-max")) { // parse -max option indexer.maxDocs = Integer.parseInt(args[++i]); } else if (i != args.length-1) { System.err.println("Usage: " + usage); return; } else { indexer.directory = new File(args[i]); } } // File fetcherDone = new File(indexer.directory, FetcherOutput.DONE_NAME); // if (!fetcherDone.exists()) // check fetcher done file // throw new RuntimeException("can't index--not yet fetched: " + // fetcherDone + " does not exist"); File doneFile = new File(indexer.directory, DONE_NAME); if (doneFile.exists()) // check index done file throw new RuntimeException("already indexed: " + doneFile + " exists"); LOG.info("indexing segment: " + indexer.directory); indexer.indexPages(); doneFile.createNewFile(); // create the done file LOG.info("done indexing"); } }